Import Libraries
library("tidyverse")
library("dplyr")
Read dataset_1 - Note: Removed first column for ID values
dataset_1 <- read.csv("../raw_data/dataset_1/winemag-data-130k-v2.csv")[-1]
Combine all the csv files for dataset_2
# function reads csv values
readfile <- function(filename){
cat("Reading file: ", filename, "...\n", sep = '' )
return(read.csv(file = filename, header = TRUE))
}
# function merges two dataframes together
merge_dataframes <- function(dataframe_1, dataframe_2){
return(rbind(dataframe_1, dataframe_2))
}
# funciton merges all csv files in folder to one dataframe
combine_data = function(mypath){
filenames=list.files(path=mypath, full.names=TRUE)
datalist = lapply(filenames, function(x) readfile(x))
return(Reduce(f = function(x,y) merge_dataframes(x,y), x = datalist, accumulate = FALSE))
}
dataset_2 <- combine_data("../raw_data/dataset_2")
Reading file: ../raw_data/dataset_2/winemag-1-800.csv...
Reading file: ../raw_data/dataset_2/winemag-10401-11200.csv...
Reading file: ../raw_data/dataset_2/winemag-11201-12000.csv...
Reading file: ../raw_data/dataset_2/winemag-12001-12800.csv...
Reading file: ../raw_data/dataset_2/winemag-1601-2400.csv...
Reading file: ../raw_data/dataset_2/winemag-2401-3200.csv...
Reading file: ../raw_data/dataset_2/winemag-3201-4000.csv...
Reading file: ../raw_data/dataset_2/winemag-4001-4800.csv...
Reading file: ../raw_data/dataset_2/winemag-4801-5600.csv...
Reading file: ../raw_data/dataset_2/winemag-5601-6400.csv...
Reading file: ../raw_data/dataset_2/winemag-6401-7200.csv...
Reading file: ../raw_data/dataset_2/winemag-7201-8000.csv...
Reading file: ../raw_data/dataset_2/winemag-8001-8800.csv...
Reading file: ../raw_data/dataset_2/winemag-801-1600.csv...
Reading file: ../raw_data/dataset_2/winemag-8801-9600.csv...
Reading file: ../raw_data/dataset_2/winemag-9601-10400.csv...
Join datasets by title, description, price, country, points = rating
join_wines <- inner_join(dataset_1, dataset_2, by = c("title", "description", "points" = "rating", "price", "country"))
Column `title` joining factors with different levels, coercing to character vectorColumn `description` joining factors with different levels, coercing to character vectorColumn `country` joining factors with different levels, coercing to character vector
Visualize Joined data
head(join_wines)
Output uncleaned joined data for others to use
write_csv(join_wines, "joined_datasets.csv")
Clean joined data to remove redundancies
join_wines_cleaned <- join_wines %>%
select(country, description, designation=designation.x, points, price, province, region=region_1, subregion=region_2, taster_name, taster_twitter_handle, title, variety, winery=winery.x, alcohol, category, url, vintage)
head(join_wines_cleaned)
Outout cleaned joined data for others to use
write_csv(join_wines_cleaned, "joined_datasets_cleaned.csv")
Extract relevant information for our use
wines <- join_wines_cleaned %>%
select(title, alcohol, category, vintage, designation, country, province, region, subregion, variety, winery, price, points, taster_name, taster_twitter_handle)
Output for easy import later
write_csv(wines, "wines.csv")